All the studies done here are based on the financial sector. The datasets used are FinancialPhraseBank, IBM stock market data, online retail purchase data and credit card purchase history.
Financial Sentiment Analysis is required to understand the general opinion of news headlines, which in turn could impact stock prices, company decisions, etc.
The dataset (FinancialPhraseBank) contains the sentiments for financial news headlines from the perspective of a retail investor. The dataset contains two columns, "Sentiment" and "News Headline". The sentiment can be negative, neutral or positive (3 categories).
# --- Financial sentiment analysis: load and inspect the FinancialPhraseBank data ---
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
# ! pip install missingno
# The CSV has no header row, so column names are supplied explicitly.
url = "https://raw.githubusercontent.com/toshihiroryuu/Dataset-test/main/Finanacial_News_Sentiment.csv"
df = pd.read_csv(url, encoding='ISO-8859-1', names=["sentiment", "text"])
df.head(5)
# identify the size of data frame
print("Size of the data frame is",df.size)
# identify the shape of the data frame
print("Shape of the data frame is",df.shape)
print("No of Rows = ", df.shape[0])
print("No of Columns = ", df.shape[1])
df.describe()
# msno.bar returns a matplotlib Axes; printing it shows the Axes repr, not the chart.
missing_bar = msno.bar(df, figsize=(6, 3), fontsize=12, color='magenta')
print(missing_bar)
No missing values were found in the dataset.
# Balance of classes: share of each sentiment label in the corpus.
class_dist = df['sentiment'].value_counts(normalize=True)
class_dist
plt.pie(class_dist.values, labels = class_dist.index,
startangle=90, autopct='%1.1f%%')
plt.title('Distribution of Sentiment')
plt.show()
# Materialise the headline column as a plain Python list. The original
# element-by-element append loop was a needless O(n) copy — a single
# list() copy produces the identical `temp`.
data_to_list = df['text'].values.tolist()
temp = list(data_to_list)
list(temp[:5])
# ! pip install python-Levenshtein
# Basic Preprocessing to remove punctuation
import gensim

def sent_to_words(sentences):
    """Yield each sentence tokenised into a list of lowercase words."""
    for raw_sentence in sentences:
        # deacc=True strips accents and punctuation during tokenisation.
        yield gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)

data_words = list(sent_to_words(temp))
print(data_words[:2])
# Combine tokens back into sentences.
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Create the detokenizer once; the original built a fresh
# TreebankWordDetokenizer instance on every call.
_detokenizer = TreebankWordDetokenizer()

def detokenize(text):
    """Join a list of tokens back into a single sentence string."""
    return _detokenizer.detokenize(text)

# Comprehension replaces the manual index loop; same resulting list.
data = [detokenize(tokens) for tokens in data_words]
print(data[:2])
data = np.array(data)
import tensorflow as tf

# Map sentiment strings to integer class ids, then one-hot encode.
labels = np.array(df['sentiment'])
class_ids = {'neutral': 0, 'negative': 1, 'positive': 2}
# NOTE(review): labels outside the three known classes are silently skipped,
# exactly as in the original if-chain — confirm every row carries one of them,
# otherwise `labels` and `data` fall out of alignment.
y = np.array([class_ids[label] for label in labels if label in class_ids])
labels = tf.keras.utils.to_categorical(y, 3, dtype="float32")
labels
len(labels)
# Keras text pipeline: tokenise headlines, pad to a fixed length, then split.
from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop,Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint
# Vocabulary capped at the 5000 most frequent words; every sequence is
# padded/truncated to 200 tokens.
max_words = 5000
max_len = 200
# Tokenise and pad sequences.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
info = pad_sequences(sequences, maxlen=max_len)
print(info)
print(labels)
from sklearn.model_selection import train_test_split
# Train test split (80/20, fixed seed for reproducibility)
X_train, X_test, Y_train, Y_test = train_test_split(info, labels, test_size = 0.2, random_state=42)
print (len(X_train),len(X_test),len(Y_train),len(Y_test))
# Define model: small embedding + single LSTM layer + softmax over 3 classes.
model = Sequential()
model.add(layers.Embedding(max_words, 20))
model.add(layers.LSTM(15, dropout=0.5))
model.add(layers.Dense(3, activation='softmax'))
model.compile(optimizer='rmsprop',
loss='categorical_crossentropy', metrics=['accuracy'])
# Checkpoints: persist only the model with the best validation accuracy.
# NOTE(review): `period=` is deprecated in newer Keras (replaced by
# `save_freq`) — confirm the installed version still accepts it.
checkpoint = ModelCheckpoint("best_model.hdf5",
monitor='val_accuracy',
verbose=1,save_best_only=True,
mode='auto', period=1,
save_weights_only=False)
# NOTE(review): the test split doubles as the validation set, so the "best"
# checkpoint is selected on the same data used for final evaluation.
history = model.fit(X_train, Y_train,
epochs = 25,validation_data=(X_test, Y_test),
callbacks=[checkpoint])
import keras
# load the saved model (the best validation-accuracy checkpoint written above).
best_model = keras.models.load_model("best_model.hdf5")
test_loss, test_acc = best_model.evaluate(X_test, Y_test, verbose=2)
print('Model accuracy: ', test_acc)
# Softmax probabilities, one row per test sample, one column per class.
predictions = best_model.predict(X_test)
predictions
from sklearn.metrics import confusion_matrix
# Predicted class = index of the highest softmax probability. Rounding the
# probabilities first (as the original did) can turn a row into all zeros,
# whose argmax silently falls back to index 0, skewing counts toward class 0.
matrix = confusion_matrix(Y_test.argmax(axis=1),
                          predictions.argmax(axis=1))
matrix
import seaborn as sns
conf_matrix = pd.DataFrame(matrix, index = ['Neutral','Negative','Positive'],
columns = ['Neutral','Negative','Positive'])
# Normalise each ROW by its total so rows sum to 1 (per-class recall).
# The previous `conf_matrix / conf_matrix.sum(axis=1)` aligned the row-sum
# Series against the COLUMN labels, dividing each column by the row sum of
# the same-named class — the wrong denominator.
conf_matrix = conf_matrix.astype('float').div(conf_matrix.sum(axis=1), axis=0)
plt.figure()
sns.heatmap(conf_matrix, annot=True)
plt.show()
# Spot-check the trained model on three raw headlines.
# Index order must match the integer encoding used during training.
sentiment = ['Neutral','Negative','Positive']
sequence = tokenizer.texts_to_sequences(['With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability'])
test = pad_sequences(sequence, maxlen=max_len)
# Highest-probability class index mapped back to its label name.
sentiment[np.around(best_model.predict(test), decimals=0).argmax(axis=1)[0]]
sequence = tokenizer.texts_to_sequences(['The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .'])
test = pad_sequences(sequence, maxlen=max_len)
sentiment[np.around(best_model.predict(test), decimals=0).argmax(axis=1)[0]]
sequence = tokenizer.texts_to_sequences(['According to Gran , the company has no plans to move all production to Russia'])
test = pad_sequences(sequence, maxlen=max_len)
sentiment[np.around(best_model.predict(test), decimals=0).argmax(axis=1)[0]]
Dataset summarizes the usage behavior of about 9000 active credit card holders during the last 6 months.
Dataset URL : https://www.kaggle.com/arjunbhasin2013/ccdata
# --- Credit-card customer segmentation: load and clean the data ---
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
# ! pip install missingno
url = "https://raw.githubusercontent.com/toshihiroryuu/Dataset-test/main/Creidit_Card.csv"
df = pd.read_csv(url)
df.head(5)
# identify the size of data frame
print("Size of the data frame is",df.size)
# identify the shape of the data frame
print("Shape of the data frame is",df.shape)
print("No of Rows = ", df.shape[0])
print("No of Columns = ", df.shape[1])
df.describe()
# msno.bar returns a matplotlib Axes; printing it shows the repr, not the chart.
missing_bar = msno.bar(df, figsize=(6, 3), fontsize=12, color='magenta')
print(missing_bar)
miss_count = df.isnull().sum().sum()
print("No of missing values is",miss_count)
# Get rows where the data is missing
null_data = df[df.isnull().any(axis=1)]
null_data.head(5)
# Drop incomplete rows; note this leaves gaps in df's integer index.
df = df.dropna()
# Recomputed only as a sanity check — empty by construction after dropna().
null_data = df[df.isnull().any(axis=1)]
df.info()
# ! pip install autoviz
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
# AutoViz re-reads the raw CSV from the URL, so it visualises the
# un-cleaned data, not the dropna()'d df above.
dff = AV.AutoViz(url)
# CUST_ID is an identifier, not a feature — drop it before clustering.
df.drop(['CUST_ID'], axis=1, inplace=True)
df.head(5)
from sklearn.preprocessing import StandardScaler
# Standard-scale every feature (zero mean, unit variance) ahead of K-Means,
# which is distance-based and sensitive to feature magnitudes.
X = StandardScaler().fit_transform(df.to_numpy())
X.shape
from sklearn.cluster import KMeans
# Elbow method: fit K-Means for k = 1..29 and record the inertia (cost).
cost=[]
n_clusters = 30
for i in range(1, n_clusters):
    kmean = KMeans(i)
    kmean.fit(X)
    cost.append(kmean.inertia_)
import matplotlib.pyplot as plt
# Plot cost against the actual k values; plotting the bare list (as before)
# put k=1 at x=0, shifting the whole elbow curve left by one.
plt.plot(range(1, n_clusters), cost, 'bx-')
plt.xlabel("K-Value")
plt.ylabel("Cost")
plt.show()
From the elbow plot, we can choose the k value to be 6.
# Final clustering with the k chosen from the elbow plot.
kmean= KMeans(6)
kmean.fit(X)
kmean.labels_
# Attach the cluster id positionally. The previous pd.concat aligned on the
# index, and because rows were dropped by dropna() earlier df's index has
# gaps while the labels frame was 0..n-1 — producing NaN-padded, misaligned
# cluster assignments.
clusters = df.assign(cluster=kmean.labels_)
clusters.head(5)
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
plt.title("Balance vs Purchases")
plt.xlabel('Balance')
plt.ylabel('Purchases')
plt.scatter("BALANCE", "PURCHASES", data = clusters[clusters.cluster == 0], color = "green")
plt.scatter("BALANCE", "PURCHASES", data = clusters[clusters.cluster == 1], color = "red")
plt.show()
import matplotlib.pyplot as plt
# NOTE(review): the 'seaborn-whitegrid' style name was renamed to
# 'seaborn-v0_8-whitegrid' in matplotlib >= 3.6 — confirm the pinned version.
plt.style.use('seaborn-whitegrid')
plt.title("Balance vs Payments")
plt.xlabel('Balance')
plt.ylabel('Payments')
# Scatter of two of the six K-Means clusters (ids 2 and 3).
plt.scatter("BALANCE", "PAYMENTS", data = clusters[clusters.cluster == 2], color = "blue")
plt.scatter("BALANCE", "PAYMENTS", data = clusters[clusters.cluster == 3], color = "magenta", marker="x" )
plt.show()
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
plt.title("Balance vs One Off Purchases")
plt.xlabel('Balance')
plt.ylabel('One Off Purchases')
# Scatter of the remaining two clusters (ids 4 and 5).
plt.scatter("BALANCE", "ONEOFF_PURCHASES", data = clusters[clusters.cluster == 4], color = "blue")
plt.scatter("BALANCE", "ONEOFF_PURCHASES", data = clusters[clusters.cluster == 5], color = "green", marker="*" )
plt.show()
from sklearn.mixture import GaussianMixture
# Gaussian mixture with the same number of components chosen for K-Means.
y_pred = GaussianMixture(n_components = 6, random_state=42).fit(X).predict(X)
# Identity relabelling kept for parity with the original cell (mapping[i] == i);
# NumPy fancy indexing replaces the per-element Python loop.
mapping = np.array([0, 1, 2, 3, 4, 5])
y_pred = mapping[y_pred]
y_pred
# One scatter series per cluster, columns 2 and 3 of the scaled features.
for cluster_id, marker in enumerate(["yo", "bs", "g^", "ro", "rs", "v"]):
    members = y_pred == cluster_id
    plt.plot(X[members, 2], X[members, 3], marker, label=f"Cluster {cluster_id}")
plt.xlabel("X", fontsize = 14)
plt.ylabel("Y", fontsize = 14)
plt.legend(loc = "upper left", fontsize = 14)
plt.show()
# ! pip install pyclustertend
from pyclustertend import hopkins
print(X.shape[0])
# Hopkins statistic for clustering tendency: values near 0 indicate strong
# clustering structure, values near 0.5 indicate uniformly random data.
hopkins(X, X.shape[0])
The null hypothesis (no meaningful cluster) holds when the Hopkins statistic is around 0.5, and the statistic tends to 0 when meaningful clusters exist in the space. Usually, we can believe in the existence of clusters when the Hopkins score is below 0.25.
Here the value is 0.035 which is very much close to zero implying that the data has significant clustering tendencies.
The dataset contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retailer.
# --- Market-basket analysis: load the online-retail transactions ---
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
# ! pip install missingno
url = "https://raw.githubusercontent.com/toshihiroryuu/Dataset-test/main/E-Commerce_Market_Basket.csv"
# encoding='ISO-8859-1' — presumably the raw file is not valid UTF-8; keep as-is.
df = pd.read_csv(url, encoding='ISO-8859-1')
# df = pd.read_csv(url, encoding='ISO-8859-1', nrows=60000)
df.head(5)
# identify the size of data frame
print("Size of the data frame is",df.size)
# identify the shape of the data frame
print("Shape of the data frame is",df.shape)
print("No of Rows = ", df.shape[0])
print("No of Columns = ", df.shape[1])
df.describe()
# msno.bar returns a matplotlib Axes; printing it shows the repr, not the chart.
missing_bar = msno.bar(df, figsize=(6, 3), fontsize=12, color='magenta')
print(missing_bar)
Customer IDs are not important here, so let us just ignore them.
# ! pip install autoviz
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
dff = AV.AutoViz(url)
# Remove the columns that are irrelevant for market-basket analysis
# in a single call instead of four separate in-place drops.
df.drop(columns=["StockCode", "InvoiceDate", "UnitPrice", "CustomerID"], inplace=True)
df.head(5)
df.Country.unique()
# Pivot German transactions into one row per invoice, one column per product,
# with values = total quantity purchased on that invoice.
basket = (df[df['Country'] =="Germany"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('InvoiceNo'))
print(basket.shape)
basket.head()
def encode_order(x):
    """One-hot encode a basket quantity: 1 if anything was bought, else 0.

    The original if/if chain returned None for quantities strictly between
    0 and 1, which would inject NaNs into the basket matrix and break the
    apriori step downstream.
    """
    return 1 if x > 0 else 0
# Binarise quantities to 0/1 membership flags for apriori.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 (use .map) —
# confirm the pinned pandas version.
basket_sets = basket.applymap(encode_order)
# POSTAGE — presumably a shipping line item rather than a product — is
# excluded so it does not dominate the frequent itemsets; confirm.
basket_sets.drop('POSTAGE', inplace=True, axis=1)
# ! pip install mlxtend
from mlxtend.frequent_patterns import apriori
# Itemsets present in at least 8% of German invoices.
frequent_itemsets = apriori(basket_sets, min_support=0.08, use_colnames=True)
# Number of items in each frequent itemset, for filtering/inspection.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets
Frequent items in the dataset are listed above.
from mlxtend.frequent_patterns import apriori, association_rules
# Mine frequent itemsets and derive association rules from them,
# keeping rules with lift >= 1 and ranking by confidence, then lift.
frq_items = apriori(basket_sets, min_support = 0.08, use_colnames = True)
rules = association_rules(frq_items, metric ="lift", min_threshold = 1).sort_values(
    by=['confidence', 'lift'], ascending=[False, False])
rules.head()
The above table shows the support, confidence and lift of the frequent pairs of items.
The dataset contains the stock market prices(High, low, open, close) and trade volumes of IBM for 6 Years(2013 to 2018).
# --- Time-series analysis of IBM stock prices (2013-2018) ---
import pandas as pd
url = "https://raw.githubusercontent.com/toshihiroryuu/Dataset-test/8b54c0c7a70e7482b51c8df525cd4fbd910b2fe0/IBM_data.csv"
# parse_dates turns the 'date' column into datetime64 for plotting below.
df = pd.read_csv(url, parse_dates=['date'])
df.head()
# identify the size of data frame
print("Size of the data frame is",df.size)
# identify the shape of the data frame
print("Shape of the data frame is",df.shape)
print("No of Rows = ", df.shape[0])
print("No of Columns = ", df.shape[1])
df.describe()
df.info()
import missingno as msno
# ! pip install missingno
miss_count = df.isnull().sum().sum()
print("No of missing values is",miss_count)
import matplotlib.pyplot as plt
# Daily high vs low prices over the full period.
plt.figure(figsize=(16,5), dpi=100)
plt.plot(df.date, df["high"], color='tab:red')
plt.plot(df.date, df["low"], color='tab:blue')
plt.gca().set(title="IBM High, Low vs Date Price", xlabel='Date', ylabel='High - Low Price')
plt.show()
# Daily open vs close prices.
plt.figure(figsize=(16,5), dpi=100)
plt.plot(df.date, df["open"], color='tab:green')
plt.plot(df.date, df["close"], color='tab:red')
plt.gca().set(title="IBM Open, Close Price vs Date", xlabel='Date', ylabel='Open - Close Price')
plt.show()
# Derive calendar features for the seasonal box plots below.
df['year'] = [d.year for d in df.date]
df['month'] = [d.strftime('%b') for d in df.date]
years = df['year'].unique()
df.head(2)
import seaborn as sns
# Reload the raw series, then rebuild the calendar features.
df = pd.read_csv(url, parse_dates=['date'])
df.reset_index(inplace=True)
df['year'] = [d.year for d in df.date]
df['month'] = [d.strftime('%b') for d in df.date]
years = df['year'].unique()
fig, axes = plt.subplots(1, 2, figsize=(20,7), dpi= 80)
sns.boxplot(x='year', y='high', data=df, ax=axes[0])
# Draw the month-wise plot on the second subplot explicitly; without ax= it
# landed on whichever axes happened to be current. (1991/2008 never occur in
# this 2013-2018 dataset, so the isin filter is a no-op kept from a template.)
sns.boxplot(x='month', y='high', data=df.loc[~df.year.isin([1991, 2008]), :], ax=axes[1])
axes[0].set_title('Year-wise Box Plot\n(The Trend)', fontsize=18);
axes[1].set_title('Month-wise Box Plot\n(The Seasonality)', fontsize=18)
plt.show()
We can clearly observe seasonality in the data. The trend over the years keeps decreasing and increasing; there is no specific trend to it. The 2015 flash crash can also be seen in the historical data.
from statsmodels.tsa.stattools import adfuller
# Augmented Dickey-Fuller test on the raw 'high' series (H0: unit root,
# i.e. non-stationary).
df = pd.read_csv(url, parse_dates=['date'])
result = adfuller(df.high.values, autolag='AIC')
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')
# Print the header once; it used to be printed (misspelled as "Critial")
# on every loop iteration.
print('Critical Values:')
for key, value in result[4].items():
    print(f' {key}, {value}')
Since the p-value (0.31) is higher than the significance level (0.05), the time series is non-stationary.
import numpy as np, pandas as pd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.figsize':(16,12), 'figure.dpi':120})
# Reload the raw series so earlier column additions don't leak in.
df = pd.read_csv(url, parse_dates=['date'])
# Original series and its 1st/2nd differences with their autocorrelation
# plots — used to choose the ARIMA differencing order d.
fig, axes = plt.subplots(3, 2, sharex=True)
axes[0, 0].plot(df.high); axes[0, 0].set_title('Original Series')
plot_acf(df.high, ax=axes[0, 1])
axes[1, 0].plot(df.high.diff()); axes[1, 0].set_title('1st Order Differencing')
plot_acf(df.high.diff().dropna(), ax=axes[1, 1])
axes[2, 0].plot(df.high.diff().diff()); axes[2, 0].set_title('2nd Order Differencing')
plot_acf(df.high.diff().diff().dropna(), ax=axes[2, 1])
plt.show()
from statsmodels.tsa.stattools import adfuller
from numpy import log
# ADF test after first-order differencing.
result = adfuller(df.high.diff().dropna())
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
from statsmodels.tsa.stattools import adfuller
from numpy import log
# ADF test after second-order differencing, for comparison.
result = adfuller(df.high.diff().diff().dropna())
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
The time series reaches stationarity with just one differencing. Since the p-value of first-order differencing is 0.00, we can choose the order of differencing to be 1 for now.
# PACF of the once-differenced series — used to pick the AR order p.
plt.rcParams.update({'figure.figsize':(9,3), 'figure.dpi':120})
fig, axes = plt.subplots(1, 2, sharex=True)
axes[0].plot(df.high.diff()); axes[0].set_title('1st Differencing')
axes[1].set(ylim=(0,5))
plot_pacf(df.high.diff().dropna(), ax=axes[1])
plt.show()
We choose the p value to be 1.
import pandas as pd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.figsize':(9,3), 'figure.dpi':120})
# NOTE(review): re-read without parse_dates — only the 'high' column is used
# from here on, so the 'date' column staying a string does not matter.
df = pd.read_csv(url, header=0)
# ACF of the once-differenced series — used to pick the MA order q.
fig, axes = plt.subplots(1, 2, sharex=True)
axes[0].plot(df.high.diff()); axes[0].set_title('1st Differencing')
plot_acf(df.high.diff().dropna(), ax=axes[1])
plt.show()
We choose the q value to be 1
# NOTE(review): statsmodels.tsa.arima_model.ARIMA was deprecated and removed
# in statsmodels >= 0.13 (replacement: statsmodels.tsa.arima.model.ARIMA) —
# confirm the pinned statsmodels version before running.
from statsmodels.tsa.arima_model import ARIMA
# 1,1,1 ARIMA Model
model = ARIMA(df.high, order=(1,1,1))
model_fit = model.fit(disp=0)
print(model_fit.summary())
# Residual diagnostics: trace and density should look like zero-mean noise.
residuals = pd.DataFrame(model_fit.resid)
fig, ax = plt.subplots(1,2)
residuals.plot(title="Residuals", ax=ax[0])
residuals.plot(kind='kde', title='Density', ax=ax[1])
plt.show()
We can observe zero mean and uniform variance in the residuals.
# Actual vs Fitted data (in-sample one-step-ahead predictions).
model_fit.plot_predict(dynamic=False)
plt.show()
from statsmodels.tsa.stattools import acf
# Train test split: first 1200 observations train, the remainder test.
train = df.high[:1200]
test = df.high[1200:]
# Build Model
model = ARIMA(train, order=(1, 1, 1))
fitted = model.fit(disp=-1)
print(fitted.summary())
# Forecast exactly one step per test observation; the hard-coded 59 steps
# only matched test.index (required below) if the file happened to contain
# exactly 1259 rows.
fc, se, conf = fitted.forecast(len(test), alpha=0.05)  # 95% conf
# Make as pandas series
fc_series = pd.Series(fc, index=test.index)
lower_series = pd.Series(conf[:, 0], index=test.index)
upper_series = pd.Series(conf[:, 1], index=test.index)
# Plot
plt.figure(figsize=(12,5), dpi=100)
plt.plot(train, label='training')
plt.plot(test, label='actual')
plt.plot(fc_series, label='forecast')
plt.fill_between(lower_series.index, lower_series, upper_series,
                 color='k', alpha=.15)
plt.title('Forecast vs Actuals')
plt.legend(loc='upper left', fontsize=8)
plt.show()
import numpy as np

def forecast_accuracy(forecast, actual):
    """Return a dict of error metrics comparing `forecast` against `actual`.

    Both arguments are expected to be 1-D numpy arrays of equal length.
    """
    mape = np.mean(np.abs(forecast - actual)/np.abs(actual))  # MAPE
    me = np.mean(forecast - actual)                           # ME
    mae = np.mean(np.abs(forecast - actual))                  # MAE
    mpe = np.mean((forecast - actual)/actual)                 # MPE
    rmse = np.mean((forecast - actual)**2)**.5                # RMSE
    corr = np.corrcoef(forecast, actual)[0,1]                 # corr
    mins = np.amin(np.hstack([forecast[:,None],
                              actual[:,None]]), axis=1)
    maxs = np.amax(np.hstack([forecast[:,None],
                              actual[:,None]]), axis=1)
    minmax = 1 - np.mean(mins/maxs)                           # minmax
    # Lag-1 autocorrelation of the forecast errors. The original read the
    # globals `fc` and `test` here instead of the function's own arguments,
    # so the metric was wrong whenever the function was reused.
    acf1 = acf(forecast - actual)[1]                          # ACF1
    return({'mape':mape, 'me':me, 'mae': mae,
            'mpe': mpe, 'rmse':rmse, 'acf1':acf1,
            'corr':corr, 'minmax':minmax})

forecast_accuracy(fc, test.values)
# Build Model with an extra order of differencing (1,2,1) for comparison.
model = ARIMA(train, order=(1, 2, 1))
fitted = model.fit(disp=-1)
print(fitted.summary())
# Forecast one step per test observation instead of the hard-coded 59, so
# the Series constructions below never mismatch test.index in length.
fc, se, conf = fitted.forecast(len(test), alpha=0.05)  # 95% conf
# Make as pandas series
fc_series = pd.Series(fc, index=test.index)
lower_series = pd.Series(conf[:, 0], index=test.index)
upper_series = pd.Series(conf[:, 1], index=test.index)
# Plot
plt.figure(figsize=(12,5), dpi=100)
plt.plot(train, label='training')
plt.plot(test, label='actual')
plt.plot(fc_series, label='forecast')
plt.fill_between(lower_series.index, lower_series, upper_series,
                 color='k', alpha=.15)
plt.title('Forecast vs Actuals')
plt.legend(loc='upper left', fontsize=8)
plt.show()
import numpy as np
forecast_accuracy(fc, test.values)
# ! pip install pmdarima
# ! pip install statsmodels
from statsmodels.tsa.arima_model import ARIMA
import pmdarima as pm
import pandas as pd
df = pd.read_csv(url, header=0)
# Exhaustive (non-stepwise) grid search over (p,d,q) up to (5,·,5), with the
# differencing order d chosen via the ADF test; seasonal terms disabled.
model_arima= pm.auto_arima(df.high,trace=True, error_action='ignore',
test='adf',start_p=1,start_q=1,max_p=5,max_q=5,d=None,
suppress_warnings=True,stepwise=False,seasonal=False)
print(model_arima.summary())
model_arima.plot_diagnostics(figsize=(15, 5))
plt.show()
# Forecast 24 steps beyond the end of the observed series, with a
# 95% confidence band.
n_periods = 24
fc, confint = model_arima.predict(n_periods = n_periods, return_conf_int=True)
# Extend the integer index past the last observation for plotting.
index_of_fc = np.arange(len(df.high), len(df.high)+n_periods)
fc_series = pd.Series(fc, index=index_of_fc)
lower_series = pd.Series(confint[:, 0], index=index_of_fc)
upper_series = pd.Series(confint[:, 1], index=index_of_fc)
plt.plot(df.high)
plt.plot(fc_series, color='darkgreen')
plt.fill_between(lower_series.index,
lower_series,
upper_series,
color='k', alpha=.15)
plt.title("Final Forecast of Price")
plt.show()
From Automatic ARIMA, the best model was found to be (1,1,1), which confirms our previous findings with AIC value of 4978.615
Thank You